{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import pandas as pd \n",
    "import os\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm\n",
    "# Output directory for the exported JSON files. It is removed and recreated\n",
    "# on every run so files from a previous export cannot linger.\n",
    "save_dir = Path(\"opendata\")\n",
    "# `Path.exists` is a method: without the call, the bound-method object is\n",
    "# always truthy and the guard never actually looks at the filesystem.\n",
    "if save_dir.exists():\n",
    "    # Best-effort removal: deletion errors are deliberately ignored.\n",
    "    shutil.rmtree(save_dir, ignore_errors=True)\n",
    "\n",
    "os.makedirs(name=save_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:   0%|          | 0/6 [00:00<?, ?it/s]Found cached dataset json (/root/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--generated_chat_0.4M-b65d3913b01e68e0/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a5154ed9933f4b91b79fea5beef23c56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:  17%|█▋        | 1/6 [00:05<00:27,  5.44s/it]Found cached dataset json (/root/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--school_math_0.25M-01ed2660a3b251c0/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cae43eff961f4213a1fdba51a98c75f1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:  33%|███▎      | 2/6 [00:09<00:17,  4.34s/it]Found cached dataset json (/root/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_2M_CN-9f5684b36fb958f4/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2c06f6842fc741f88ac70a78e305cb91",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:  50%|█████     | 3/6 [00:18<00:20,  6.92s/it]Found cached dataset json (/root/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_1M_CN-c99dfb2fac2ab434/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a1bb129f58f64da6a6da825eede617da",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:  67%|██████▋   | 4/6 [00:23<00:11,  5.93s/it]Found cached dataset json (/root/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_0.5M_CN-30591af9a26b6c1b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8de4a8cf55da41f794fb7e495b0b5f93",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss:  83%|████████▎ | 5/6 [00:26<00:05,  5.03s/it]Found cached dataset json (/root/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--multiturn_chat_0.8M-9e895b626c42bf3b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8733f9026764443996679432b5c4d6fb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "processsssssssssss: 100%|██████████| 6/6 [00:32<00:00,  5.47s/it]\n"
     ]
    }
   ],
   "source": [
    "# Hugging Face dataset ids to sample from.\n",
    "# Tip: shrink this list to a single entry for a quick smoke test.\n",
    "belle_list = [\n",
    "    'BelleGroup/generated_chat_0.4M',\n",
    "    'BelleGroup/school_math_0.25M',\n",
    "    'BelleGroup/train_2M_CN',\n",
    "    'BelleGroup/train_1M_CN',\n",
    "    'BelleGroup/train_0.5M_CN',\n",
    "    'BelleGroup/multiturn_chat_0.8M',\n",
    "]\n",
    "\n",
    "# Tag embedded in every output file name.\n",
    "datasets_class = \"BelleGroup\"\n",
    "# Number of leading rows of each train split that get exported.\n",
    "max_rows = 20000\n",
    "\n",
    "for index, temp_data_name in tqdm(enumerate(belle_list), total=len(belle_list), desc=\"processsssssssssss\"):\n",
    "    data1 = load_dataset(path=temp_data_name)\n",
    "    train_split = data1['train']\n",
    "    # Slice the dataset *before* converting to pandas so that only the rows\n",
    "    # being exported are materialised as a DataFrame, not the whole split.\n",
    "    head_rows = train_split.select(range(min(max_rows, len(train_split))))\n",
    "    # One JSON array of records per dataset; non-ASCII text is written as-is.\n",
    "    head_rows.to_pandas().to_json(\n",
    "        save_dir.joinpath(f\"data_{index}_{datasets_class}.json\"),\n",
    "        force_ascii=False,\n",
    "        orient='records',\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# belle_list = ['BelleGroup/generated_chat_0.4M', 'BelleGroup/school_math_0.25M', 'BelleGroup/train_2M_CN', 'BelleGroup/train_1M_CN',\n",
    "#               'BelleGroup/train_0.5M_CN', 'BelleGroup/multiturn_chat_0.8M']\n",
    "\n",
    "# datasets_class = \"BelleGroup\"\n",
    "# for index, temp_data_name in tqdm(enumerate(belle_list), total=len(belle_list), desc=\"processsssssssssss\"):\n",
    "#     data1 = load_dataset(path=temp_data_name)\n",
    "#     data1['train'].to_pandas().head(2000).pipe(\n",
    "#         lambda x: x.assign(**{\n",
    "#             'q': x.apply(lambda j: f'{j[\"instruction\"]}\\n{j[\"input\"]}', axis=1),\n",
    "#             'a': x['output']\n",
    "#         })\n",
    "#     )[['q', 'a']].to_json(save_dir.joinpath(f\"data_{index}_{datasets_class}.json\"), force_ascii=False, orient='records')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mynet",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
